library(data.table)
library(dplyr)
library(tidyverse)
library(dtplyr)
library(knitr)
library(plotly)
library(widgetframe)

Load both datasets

# load COVID state-level data from NYT
cv_states <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))

# load state population data
state_pops <- as.data.frame(data.table::fread("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
# Merge data sets
cv_states <- merge(cv_states, state_pops, by="state")

2. Look at the data

dim(cv_states)
## [1] 51334     9
head(cv_states)
##     state       date fips   cases deaths geo_id population pop_density abb
## 1 Alabama 2022-10-13    1 1528739  20505      1    4887871    96.50939  AL
## 2 Alabama 2022-03-17    1 1290692  18998      1    4887871    96.50939  AL
## 3 Alabama 2020-10-06    1  160477   2580      1    4887871    96.50939  AL
## 4 Alabama 2021-08-02    1  589110  11536      1    4887871    96.50939  AL
## 5 Alabama 2021-06-08    1  546540  11220      1    4887871    96.50939  AL
## 6 Alabama 2021-07-06    1  551298  11358      1    4887871    96.50939  AL
tail(cv_states)
##         state       date fips  cases deaths geo_id population pop_density abb
## 51329 Wyoming 2022-08-12   56 172473   1865     56     577737    5.950611  WY
## 51330 Wyoming 2021-04-23   56  57696    705     56     577737    5.950611  WY
## 51331 Wyoming 2021-10-19   56  98567   1136     56     577737    5.950611  WY
## 51332 Wyoming 2021-04-22   56  57613    705     56     577737    5.950611  WY
## 51333 Wyoming 2021-12-31   56 115638   1526     56     577737    5.950611  WY
## 51334 Wyoming 2020-11-28   56  31929    215     56     577737    5.950611  WY
str(cv_states)
## 'data.frame':    51334 obs. of  9 variables:
##  $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ date       : IDate, format: "2022-10-13" "2022-03-17" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  1528739 1290692 160477 589110 546540 551298 547135 1302397 1291946 1334981 ...
##  $ deaths     : int  20505 18998 2580 11536 11220 11358 11252 19601 19169 19696 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : chr  "AL" "AL" "AL" "AL" ...

3. Format the data

# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame':    51334 obs. of  9 variables:
##  $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date       : Date, format: "2020-03-13" "2020-03-14" ...
##  $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
##  $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
##  $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
##  $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
##       state       date fips cases deaths geo_id population pop_density abb
## 737 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
## 542 Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
## 971 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
## 283 Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
## 782 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
## 389 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
tail(cv_states)
##         state       date fips  cases deaths geo_id population pop_density abb
## 51200 Wyoming 2022-11-08   56 179366   1917     56     577737    5.950611  WY
## 51256 Wyoming 2022-11-09   56 179366   1917     56     577737    5.950611  WY
## 50948 Wyoming 2022-11-10   56 179366   1917     56     577737    5.950611  WY
## 50489 Wyoming 2022-11-11   56 179366   1917     56     577737    5.950611  WY
## 51225 Wyoming 2022-11-12   56 179366   1917     56     577737    5.950611  WY
## 51198 Wyoming 2022-11-13   56 179366   1917     56     577737    5.950611  WY
# Inspect range of values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
##       state       date fips cases deaths geo_id population pop_density abb
## 737 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
## 542 Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
## 971 Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
## 283 Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
## 782 Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
## 389 Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
summary(cv_states)
##            state            date                 fips           cases         
##  Washington   : 1028   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
##  Illinois     : 1025   1st Qu.:2020-11-03   1st Qu.:16.00   1st Qu.:   89878  
##  California   : 1024   Median :2021-07-08   Median :29.00   Median :  343759  
##  Arizona      : 1023   Mean   :2021-07-07   Mean   :29.78   Mean   :  816807  
##  Massachusetts: 1017   3rd Qu.:2022-03-12   3rd Qu.:44.00   3rd Qu.:  962210  
##  Wisconsin    : 1013   Max.   :2022-11-13   Max.   :72.00   Max.   :11411720  
##  (Other)      :45204                                                          
##      deaths          geo_id        population        pop_density       
##  Min.   :    0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
##  1st Qu.: 1369   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
##  Median : 5122   Median :29.00   Median : 4468402   Median :  107.860  
##  Mean   :11399   Mean   :29.78   Mean   : 6403869   Mean   :  422.946  
##  3rd Qu.:14419   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
##  Max.   :97142   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
##                                                     NA's   :976        
##       abb       
##  WA     : 1028  
##  IL     : 1025  
##  CA     : 1024  
##  AZ     : 1023  
##  MA     : 1017  
##  WI     : 1013  
##  (Other):45204
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-11-13"
# Add variables for new_cases and new_deaths


for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  cv_subset = cv_subset[order(cv_subset$date),]
  
  # add starting level for new cases and deaths
  cv_subset$new_cases = cv_subset$cases[1]
  cv_subset$new_deaths = cv_subset$deaths[1]
  

  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
  }
  
  # include in main dataset
  cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`

for (i in 1:length(state_list)) {
  cv_subset = subset(cv_states, state == state_list[i])
  
  # add starting level for new cases and deaths
  cv_subset$cases = cv_subset$cases[1]
  cv_subset$deaths = cv_subset$deaths[1]
  
  ### FINISH CODE HERE
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
  }
  
  # include in main dataset
  cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
library(zoo)

# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)

cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2=NULL

5. Add additional variables

# add population normalized (by 100,000) counts for each variable
cv_states$per100k =  as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k =  as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k =  as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k =  as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` dataset
cv_states_today = subset(cv_states, date==max(cv_states$date))

II. Scatterplots

6. Explore scatterplots using plot_ly()

# pop_density vs. cases
cv_states_today %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# filter out "District of Columbia"
cv_states_today_filter <- cv_states_today %>% filter(state!="District of Columbia")

# pop_density vs. cases after filtering
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~cases, 
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# pop_density vs. deathsper100k
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5))
# Adding hoverinfo
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
          hoverinfo = 'text',
          text = ~paste( paste(state, ":", sep=""), 
                paste(" Cases per 100k: ", per100k, sep="") , 
                paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
        yaxis = list(title = "Deaths per 100k"), 
         xaxis = list(title = "Population Density"),
         hovermode = "compare")

7. Explore scatterplot trend interactively using ggplotly() and geom_smooth()

  • For pop_density vs. newdeathsper100k create a chart with the same variables using gglot_ly()
  • Explore the pattern between \(x\) and \(y\) using geom_smooth()
    • Explain what you see. Do you think pop_density is a correlate of newdeathsper100k?
### FINISH CODE HERE
p <- ggplot(cv_states_today_filter, aes(x=pop_density, y=deathsper100k, size=population)) + geom_point() + geom_smooth()
ggplotly(p)

_There is perhaps a weak correlation between at higher population densities (from 750 to 1250 there is an upward linear appearing trend). However I doubt this correlation is very strong. The difference in new deaths per 100K is only 100/100,000 or 0.1% different between a population density near 0 and the highest measured, 1250.

8. Multiple line chart

  • Create a line chart of the naive_CFR for all states over time using plot_ly()
    • Use the zoom and pan tools to inspect the naive_CFR for the states that had an increase in September. How have they changed over time?
  • Create one more line chart, for Florida only, which shows new_cases and new_deaths together in one plot. Hint: use add_layer()
    • Use hoverinfo to “eyeball” the approximate peak of deaths and peak of cases. What is the time delay between the peak of cases and the peak of deaths?
# Line chart for naive_CFR for all states over time using `plot_ly()`
plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
# I looked for states that had increases in Sept.. really don't see any.


# Line chart for Florida showing new_cases and new_deaths together
cv_states %>% filter(state=="Florida") %>% plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines") %>% 
  add_lines(x = ~date, y = ~new_deaths, type = "scatter", mode = "lines") 

9. Heatmaps

Create a heatmap to visualize new_cases for each state on each date greater than June 1st, 2021 - Start by mapping selected features in the dataframe into a matrix using the tidyr package function pivot_wider(), naming the rows and columns, as done in the lecture notes - Use plot_ly() to create a heatmap out of this matrix. Which states stand out? - Repeat with newper100k variable. Now which states stand out? - Create a second heatmap in which the pattern of new_cases for each state over time becomes more clear by filtering to only look at dates every two weeks

# Map state, date, and new_cases to a matrix
library(tidyr)
cv_states_mat <- cv_states %>% select(state, date, new_cases) %>% dplyr::filter(date>as.Date("2022-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)
# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)
# Repeat with newper100k
cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% dplyr::filter(date>as.Date("2022-06-15"))
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)


plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2022-06-15"), as.Date("2022-11-01"), by="2 weeks")

cv_states_mat <- cv_states %>% select(state, date, newper100k) %>% filter(date %in% filter_dates)
cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
plot_ly(x=colnames(cv_states_mat2), y=rownames(cv_states_mat2),
             z=~cv_states_mat2,
             type="heatmap",
             showscale=T)

10. Map

  • Create a map to visualize the naive_CFR by state on October 15, 2021
  • Compare with a map visualizing the naive_CFR by state on most recent date
  • Plot the two maps together using subplot(). Make sure the shading is for the same range of values (google is your friend for this)
  • Describe the difference in the pattern of the CFR.
### For specified date
pick.date = "2022-10-15"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% 
  filter(date==pick.date) %>%
  select(state, abb, newper100k, cases, deaths) 
# select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL


# Create hover text
cv_per100$hover <- with(cv_per100, 
                        paste(state_name, '<br>', 
                    "Cases per 100k: ", newper100k, '<br>',
                    "Cases: ", cases, '<br>', 
                    "Deaths: ", deaths))
# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)


# Make sure both maps are on the same color scale
shadeLimit <- 35


# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, 
    text = ~hover, 
    locations = ~state,
    color = ~newper100k, 
    colors = 'Purples'
  )
fig <- fig %>% 
  colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))

fig <- fig %>% 
  layout(
    geo = set_map_details
  )
fig_pick.date <- fig

Map for today’s date

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states_today %>%  
  select(state, abb, newper100k, cases, deaths) 
# select data
cv_per100$state_name <- cv_per100$state
cv_per100$state <- cv_per100$abb
cv_per100$abb <- NULL

# Create hover text
cv_per100$hover <- with(cv_per100, 
            paste(state_name, '<br>', 
            "Cases per 100k: ", newper100k, '<br>',
            "Cases: ", cases, '<br>', 
            "Deaths: ", deaths))

# Create the map
fig <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~newper100k, 
    text = ~hover, 
    locations = ~state,
    color = ~newper100k, 
    colors = 'Purples'
  )
fig <- fig %>% 
  colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
fig <- fig %>% 
  layout(
    geo = set_map_details
  )
fig_Today <- fig
fig_Today
### Plot together 
finalfig <- subplot(fig_pick.date, fig_Today, nrows = 2) %>% 
  layout(showlegend = FALSE,
         title = paste('Cases per 100k by State', 
                       '<br>(Hover for value)'),
         hovermode = TRUE
         ) %>%
  colorbar(title = "Cases per 100k", limits = c(0,shadeLimit))
annotations = list( 
  list( 
    x = 0.5,
    y = 0.5,
    text = pick.date,  
    xref = "paper",  
    yref = "paper",  
    xanchor = "center",  
    yanchor = "bottom",  
    showarrow = FALSE   
  ),  
  list( 
    x = 0.5,
    y = -0.05,
    text = Sys.Date(),  
    xref = "paper",  
    yref = "paper",  
    xanchor = "center",  
    yanchor = "bottom",  
    showarrow = FALSE 
  ))
finalfig <- finalfig %>%layout(annotations = annotations) 
finalfig

_ Between Oct 15th, 2022 and Nov 14th, 2022 there has been a shift in the location of high cases per 100K. In October, Kentucky had the highest case rate. Now in November it is New Mexico. It appears that all the four corners states have more cases in Nov than they did in Oct. Some states however such as MOntana and Florida have a lower case rate in Nov than in Oct. In general New England region appears to have lower cases in Nov than they did in Oct._